{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EDA数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# kaggle数据名：tabular-playground-series-nov-2021\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('datasets/train.csv', index_col='id')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 基本信息\n",
    "\n",
    "一共有600000条数据，100个特征。全部是`float`类型。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 600000 entries, 0 to 599999\n",
      "Columns: 101 entries, f0 to target\n",
      "dtypes: float64(100), int64(1)\n",
      "memory usage: 466.9 MB\n"
     ]
    }
   ],
   "source": [
    "train_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>f0</th>\n",
       "      <th>f1</th>\n",
       "      <th>f2</th>\n",
       "      <th>f3</th>\n",
       "      <th>f4</th>\n",
       "      <th>f5</th>\n",
       "      <th>f6</th>\n",
       "      <th>f7</th>\n",
       "      <th>f8</th>\n",
       "      <th>f9</th>\n",
       "      <th>...</th>\n",
       "      <th>f91</th>\n",
       "      <th>f92</th>\n",
       "      <th>f93</th>\n",
       "      <th>f94</th>\n",
       "      <th>f95</th>\n",
       "      <th>f96</th>\n",
       "      <th>f97</th>\n",
       "      <th>f98</th>\n",
       "      <th>f99</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "      <td>600000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>0.306508</td>\n",
       "      <td>2.497590</td>\n",
       "      <td>306.644536</td>\n",
       "      <td>2.647901</td>\n",
       "      <td>0.177850</td>\n",
       "      <td>2.556832</td>\n",
       "      <td>2.699650</td>\n",
       "      <td>2.571593</td>\n",
       "      <td>2.538273</td>\n",
       "      <td>0.134370</td>\n",
       "      <td>...</td>\n",
       "      <td>2.444471</td>\n",
       "      <td>0.155260</td>\n",
       "      <td>0.059407</td>\n",
       "      <td>0.144932</td>\n",
       "      <td>0.106419</td>\n",
       "      <td>2.547853</td>\n",
       "      <td>2.590159</td>\n",
       "      <td>0.158881</td>\n",
       "      <td>0.123048</td>\n",
       "      <td>0.506010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.522450</td>\n",
       "      <td>1.554018</td>\n",
       "      <td>551.743893</td>\n",
       "      <td>1.544529</td>\n",
       "      <td>0.417488</td>\n",
       "      <td>1.562527</td>\n",
       "      <td>1.564000</td>\n",
       "      <td>1.549361</td>\n",
       "      <td>1.532988</td>\n",
       "      <td>0.421892</td>\n",
       "      <td>...</td>\n",
       "      <td>1.542509</td>\n",
       "      <td>0.548397</td>\n",
       "      <td>0.119426</td>\n",
       "      <td>0.462015</td>\n",
       "      <td>0.209128</td>\n",
       "      <td>1.558427</td>\n",
       "      <td>1.525091</td>\n",
       "      <td>0.436190</td>\n",
       "      <td>0.264896</td>\n",
       "      <td>0.499964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-3.797450</td>\n",
       "      <td>-1.223960</td>\n",
       "      <td>-1842.530000</td>\n",
       "      <td>-1.368560</td>\n",
       "      <td>-3.206210</td>\n",
       "      <td>-1.169770</td>\n",
       "      <td>-1.059310</td>\n",
       "      <td>-1.281970</td>\n",
       "      <td>-1.242020</td>\n",
       "      <td>-2.577840</td>\n",
       "      <td>...</td>\n",
       "      <td>-1.217700</td>\n",
       "      <td>-9.761770</td>\n",
       "      <td>-4.666240</td>\n",
       "      <td>-3.101500</td>\n",
       "      <td>-1.276540</td>\n",
       "      <td>-1.584740</td>\n",
       "      <td>-1.254730</td>\n",
       "      <td>-3.993500</td>\n",
       "      <td>-2.783380</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.026222</td>\n",
       "      <td>1.186237</td>\n",
       "      <td>43.573400</td>\n",
       "      <td>1.442028</td>\n",
       "      <td>0.019709</td>\n",
       "      <td>1.261038</td>\n",
       "      <td>1.385820</td>\n",
       "      <td>1.333848</td>\n",
       "      <td>1.292163</td>\n",
       "      <td>0.019563</td>\n",
       "      <td>...</td>\n",
       "      <td>1.214177</td>\n",
       "      <td>0.018904</td>\n",
       "      <td>0.024483</td>\n",
       "      <td>0.017055</td>\n",
       "      <td>0.025461</td>\n",
       "      <td>1.247888</td>\n",
       "      <td>1.348078</td>\n",
       "      <td>0.013536</td>\n",
       "      <td>0.018105</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.097788</td>\n",
       "      <td>2.516500</td>\n",
       "      <td>133.626000</td>\n",
       "      <td>2.634130</td>\n",
       "      <td>0.061586</td>\n",
       "      <td>2.590425</td>\n",
       "      <td>2.801255</td>\n",
       "      <td>2.557985</td>\n",
       "      <td>2.475880</td>\n",
       "      <td>0.058752</td>\n",
       "      <td>...</td>\n",
       "      <td>2.386845</td>\n",
       "      <td>0.068906</td>\n",
       "      <td>0.056649</td>\n",
       "      <td>0.063439</td>\n",
       "      <td>0.062151</td>\n",
       "      <td>2.601940</td>\n",
       "      <td>2.682090</td>\n",
       "      <td>0.058058</td>\n",
       "      <td>0.058471</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.397184</td>\n",
       "      <td>3.787630</td>\n",
       "      <td>302.262250</td>\n",
       "      <td>3.907640</td>\n",
       "      <td>0.112712</td>\n",
       "      <td>3.813662</td>\n",
       "      <td>3.996913</td>\n",
       "      <td>3.823450</td>\n",
       "      <td>3.804360</td>\n",
       "      <td>0.101046</td>\n",
       "      <td>...</td>\n",
       "      <td>3.693872</td>\n",
       "      <td>0.125165</td>\n",
       "      <td>0.088162</td>\n",
       "      <td>0.113114</td>\n",
       "      <td>0.102016</td>\n",
       "      <td>3.820665</td>\n",
       "      <td>3.839520</td>\n",
       "      <td>0.110718</td>\n",
       "      <td>0.104872</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>8.781500</td>\n",
       "      <td>6.226720</td>\n",
       "      <td>6119.280000</td>\n",
       "      <td>6.521150</td>\n",
       "      <td>8.265470</td>\n",
       "      <td>6.515070</td>\n",
       "      <td>6.586780</td>\n",
       "      <td>6.258770</td>\n",
       "      <td>6.389670</td>\n",
       "      <td>7.078460</td>\n",
       "      <td>...</td>\n",
       "      <td>6.573890</td>\n",
       "      <td>18.412800</td>\n",
       "      <td>10.211800</td>\n",
       "      <td>8.623270</td>\n",
       "      <td>3.657220</td>\n",
       "      <td>6.254360</td>\n",
       "      <td>6.145300</td>\n",
       "      <td>10.767000</td>\n",
       "      <td>5.988110</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  f0             f1             f2             f3  \\\n",
       "count  600000.000000  600000.000000  600000.000000  600000.000000   \n",
       "mean        0.306508       2.497590     306.644536       2.647901   \n",
       "std         0.522450       1.554018     551.743893       1.544529   \n",
       "min        -3.797450      -1.223960   -1842.530000      -1.368560   \n",
       "25%         0.026222       1.186237      43.573400       1.442028   \n",
       "50%         0.097788       2.516500     133.626000       2.634130   \n",
       "75%         0.397184       3.787630     302.262250       3.907640   \n",
       "max         8.781500       6.226720    6119.280000       6.521150   \n",
       "\n",
       "                  f4             f5             f6             f7  \\\n",
       "count  600000.000000  600000.000000  600000.000000  600000.000000   \n",
       "mean        0.177850       2.556832       2.699650       2.571593   \n",
       "std         0.417488       1.562527       1.564000       1.549361   \n",
       "min        -3.206210      -1.169770      -1.059310      -1.281970   \n",
       "25%         0.019709       1.261038       1.385820       1.333848   \n",
       "50%         0.061586       2.590425       2.801255       2.557985   \n",
       "75%         0.112712       3.813662       3.996913       3.823450   \n",
       "max         8.265470       6.515070       6.586780       6.258770   \n",
       "\n",
       "                  f8             f9  ...            f91            f92  \\\n",
       "count  600000.000000  600000.000000  ...  600000.000000  600000.000000   \n",
       "mean        2.538273       0.134370  ...       2.444471       0.155260   \n",
       "std         1.532988       0.421892  ...       1.542509       0.548397   \n",
       "min        -1.242020      -2.577840  ...      -1.217700      -9.761770   \n",
       "25%         1.292163       0.019563  ...       1.214177       0.018904   \n",
       "50%         2.475880       0.058752  ...       2.386845       0.068906   \n",
       "75%         3.804360       0.101046  ...       3.693872       0.125165   \n",
       "max         6.389670       7.078460  ...       6.573890      18.412800   \n",
       "\n",
       "                 f93            f94            f95            f96  \\\n",
       "count  600000.000000  600000.000000  600000.000000  600000.000000   \n",
       "mean        0.059407       0.144932       0.106419       2.547853   \n",
       "std         0.119426       0.462015       0.209128       1.558427   \n",
       "min        -4.666240      -3.101500      -1.276540      -1.584740   \n",
       "25%         0.024483       0.017055       0.025461       1.247888   \n",
       "50%         0.056649       0.063439       0.062151       2.601940   \n",
       "75%         0.088162       0.113114       0.102016       3.820665   \n",
       "max        10.211800       8.623270       3.657220       6.254360   \n",
       "\n",
       "                 f97            f98            f99         target  \n",
       "count  600000.000000  600000.000000  600000.000000  600000.000000  \n",
       "mean        2.590159       0.158881       0.123048       0.506010  \n",
       "std         1.525091       0.436190       0.264896       0.499964  \n",
       "min        -1.254730      -3.993500      -2.783380       0.000000  \n",
       "25%         1.348078       0.013536       0.018105       0.000000  \n",
       "50%         2.682090       0.058058       0.058471       1.000000  \n",
       "75%         3.839520       0.110718       0.104872       1.000000  \n",
       "max         6.145300      10.767000       5.988110       1.000000  \n",
       "\n",
       "[8 rows x 101 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据切分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, X_t, y, y_t = train_test_split(\n",
    "    train_df.iloc[:, :-1],\n",
    "    train_df['target'],\n",
    "    test_size=0.2,\n",
    "    random_state=12\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "X.to_csv('datasets/X.csv')\n",
    "X_t.to_csv('datasets/X_t.csv')\n",
    "y.to_csv('datasets/y.csv')\n",
    "y_t.to_csv('datasets/y_t.csv')"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "81ee8574ca4b949f1a68324cabe052a24810371303122f0dfbc70cb842d7a9dc"
  },
  "kernelspec": {
   "display_name": "Python 3.7.4 64-bit ('stats': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
